API Evolution and Breaking Changes Analysis¶

This notebook analyzes the dataset in data/, where each library has:

  • library-commits.csv: commit-level project/API metrics
  • library-bcs.csv: per-breaking-change records

It focuses on commit dynamics, API growth, breaking-change intensity, compatibility impact, and the role of excluded/internal/deprecated symbols.

In [1]:
from pathlib import Path
import math
import re

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

# Global plotting/display configuration shared by every cell below.
sns.set_theme(style="whitegrid", context="talk")
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 220)

# All per-library CSVs are expected under ./data, relative to the notebook's
# working directory; fail fast with a clear message if the layout is wrong.
DATA_DIR = Path("data")
if not DATA_DIR.exists():
    raise FileNotFoundError("Expected directory 'data/' with split CSV files.")

# Record environment so rendered outputs are reproducible/attributable.
print(f"Using data directory: {DATA_DIR.resolve()}")
print(f"pandas={pd.__version__}, seaborn={sns.__version__}")
Using data directory: /home/dig/repositories/roseau-full-bench/results/use_case/walk/data
pandas=2.3.3, seaborn=0.13.2
In [2]:
# Columns in *-commits.csv stored as "true"/"false" strings; parsed to bool on load.
BOOL_COLS_COMMITS = ["is_merge_commit", "has_java_changes", "has_pom_changes"]
# Columns in *-bcs.csv stored as "true"/"false" strings; parsed to bool on load.
BOOL_COLS_BCS = ["is_excluded_symbol", "is_deprecated_removal", "is_internal_removal"]
# Commit-level columns coerced to numeric on load; unparseable/missing values become 0.
NUMERIC_COLS_COMMITS = [
    "days_since_prev_commit", "files_changed", "loc_added", "loc_deleted",
    "all_api_types_count", "all_api_methods_count", "all_api_fields_count", "all_api_symbols_count",
    "exported_types_count", "exported_methods_count", "exported_fields_count",
    "deprecated_count", "internal_count",
    "breaking_changes_count", "binary_breaking_changes_count", "source_breaking_changes_count",
    "checkout_time_ms", "classpath_time_ms", "api_time_ms", "diff_time_ms", "stats_time_ms",
]


def parse_bool_series(s: pd.Series) -> pd.Series:
    """Coerce a raw CSV column to booleans.

    Case-insensitive "true" maps to True; "false", NaN, and any
    unrecognized value all map to False, so the result is always
    a clean bool dtype with no missing values.
    """
    normalized = s.astype(str).str.lower()
    return normalized.map({"true": True, "false": False}).eq(True)


def load_split_data(data_dir: Path):
    """Load and normalize every per-library CSV pair under `data_dir`.

    Each library contributes `<lib>-commits.csv` (one row per commit) and
    `<lib>-bcs.csv` (one row per breaking-change event). Commit frames are
    date-parsed, numeric/bool-coerced, and extended with derived metrics
    used throughout the notebook.

    Returns a 4-tuple:
      - commits: concatenated commit frames (empty DataFrame if none found)
      - bcs: concatenated breaking-change frames (empty DataFrame if none found)
      - libraries: sorted names present on BOTH sides
      - missing_pairs: sorted names present on only one side
    """
    commits_frames = []
    bcs_frames = []

    commit_files = sorted(data_dir.glob("*-commits.csv"))
    bc_files = sorted(data_dir.glob("*-bcs.csv"))

    # Library name = filename minus its suffix; warn (but continue) when a
    # library has only one of the two files.
    libs_from_commits = {p.name.replace("-commits.csv", "") for p in commit_files}
    libs_from_bcs = {p.name.replace("-bcs.csv", "") for p in bc_files}

    missing_pairs = sorted(libs_from_commits.symmetric_difference(libs_from_bcs))
    if missing_pairs:
        print("Warning: some libraries do not have both files:", missing_pairs)

    for path in commit_files:
        lib = path.name.replace("-commits.csv", "")
        df = pd.read_csv(path)
        if "library" not in df.columns:
            df["library"] = lib

        # Drop rows with unparseable timestamps and order chronologically so
        # downstream "last" aggregations reflect the latest commit state.
        df["date_utc"] = pd.to_datetime(df["date_utc"], errors="coerce", utc=True)
        df = df.dropna(subset=["date_utc"]).sort_values("date_utc").reset_index(drop=True)

        for c in NUMERIC_COLS_COMMITS:
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)

        for c in BOOL_COLS_COMMITS:
            if c in df.columns:
                df[c] = parse_bool_series(df[c])

        # Derived commit-level features
        df["net_loc"] = df["loc_added"] - df["loc_deleted"]
        df["abs_loc_churn"] = df["loc_added"] + df["loc_deleted"]
        df["exported_symbols_count"] = (
            df["exported_types_count"] + df["exported_methods_count"] + df["exported_fields_count"]
        )
        df["all_symbols_count"] = (
            df["all_api_types_count"] + df["all_api_methods_count"] + df["all_api_fields_count"]
        )
        # Ratios guard against zero denominators by mapping 0 -> NaN first,
        # then collapsing NaN/inf back to 0 so plots stay finite.
        df["breaks_per_1k_exported"] = (
            df["breaking_changes_count"] / df["exported_symbols_count"].replace(0, np.nan) * 1000
        ).replace([np.inf, -np.inf], np.nan).fillna(0)
        df["internal_share"] = (
            df["internal_count"] / df["all_symbols_count"].replace(0, np.nan)
        ).replace([np.inf, -np.inf], np.nan).fillna(0)
        df["deprecated_share"] = (
            df["deprecated_count"] / df["exported_symbols_count"].replace(0, np.nan)
        ).replace([np.inf, -np.inf], np.nan).fillna(0)

        commits_frames.append(df)

    for path in bc_files:
        lib = path.name.replace("-bcs.csv", "")
        df = pd.read_csv(path)
        if "library" not in df.columns:
            df["library"] = lib

        for c in BOOL_COLS_BCS:
            if c in df.columns:
                df[c] = parse_bool_series(df[c])

        bcs_frames.append(df)

    commits = pd.concat(commits_frames, ignore_index=True) if commits_frames else pd.DataFrame()
    bcs = pd.concat(bcs_frames, ignore_index=True) if bcs_frames else pd.DataFrame()

    return commits, bcs, sorted(libs_from_commits.intersection(libs_from_bcs)), missing_pairs


# Load everything once; downstream cells treat commits_df / bcs_df as read-only.
commits_df, bcs_df, libraries, missing_pairs = load_split_data(DATA_DIR)

print(f"Libraries with complete pairs: {len(libraries)}")
print(f"Commit rows: {len(commits_df):,}")
print(f"Breaking-change rows: {len(bcs_df):,}")
if missing_pairs:
    print("Libraries missing one side:", missing_pairs)
Libraries with complete pairs: 35
Commit rows: 146,334
Breaking-change rows: 658,235

Data Quality and Coverage¶

In [3]:
# One row per library: activity span, break volume, final API size, commit mix.
per_library = commits_df.groupby("library")
coverage = per_library.agg(
    commits=("commit_sha", "count"),
    first_commit=("date_utc", "min"),
    last_commit=("date_utc", "max"),
    commits_with_breaks=("breaking_changes_count", lambda s: int(s.gt(0).sum())),
    total_reported_breaks=("breaking_changes_count", "sum"),
    final_exported_symbols=("exported_symbols_count", "last"),
    final_internal_symbols=("internal_count", "last"),
    merge_commit_rate=("is_merge_commit", "mean"),
    java_change_rate=("has_java_changes", "mean"),
)
coverage = coverage.sort_values("total_reported_breaks", ascending=False)

# Sanity check: the commit-level break counter should agree with the number
# of row-level BC records (delta of 0 means the two tables are consistent).
bc_counts_from_rows = bcs_df.groupby("library").size().rename("bc_rows")
coverage = coverage.join(bc_counts_from_rows, how="left").fillna({"bc_rows": 0})
coverage["bc_count_delta"] = coverage["total_reported_breaks"] - coverage["bc_rows"]

coverage.head(20)
Out[3]:
commits first_commit last_commit commits_with_breaks total_reported_breaks final_exported_symbols final_internal_symbols merge_commit_rate java_change_rate bc_rows bc_count_delta
library
assertj-core 3672 2010-09-07 04:06:36+00:00 2026-02-07 22:12:04+00:00 1001 230771 6932 85 0.049564 0.512527 230771 0
jackson-databind 6582 2011-12-23 08:31:35+00:00 2026-02-08 04:52:24+00:00 2007 145244 9122 0 0.421908 0.822546 145244 0
rxjava-core 3499 2013-01-09 06:21:43+00:00 2026-02-06 08:29:11+00:00 541 128110 4254 1850 0.335810 0.661332 128110 0
hibernate-core 4826 2010-10-11 19:41:47+00:00 2016-08-10 05:41:04+00:00 846 34244 25810 9464 0.016162 0.846042 34244 0
h2-database 8024 2006-12-15 00:12:44+00:00 2026-01-31 05:09:18+00:00 1384 34007 10719 0 0.192547 0.871386 34007 0
tomcat 27700 2006-03-27 13:53:46+00:00 2026-02-06 22:39:49+00:00 3287 33514 23836 0 0.002310 0.750433 33514 0
commons-collections 4856 2001-04-14 15:38:58+00:00 2026-02-06 14:23:17+00:00 406 9963 3713 0 0.022035 0.678336 9963 0
jackson-core 2572 2011-12-23 07:00:40+00:00 2026-02-07 21:49:29+00:00 424 7384 3002 0 0.422628 0.683904 7384 0
joda-time 2132 2003-12-16 21:39:27+00:00 2026-01-21 20:45:38+00:00 178 3676 3045 0 0.043152 0.568011 3676 0
netty-codec-http 9661 2011-12-28 10:44:04+00:00 2026-02-06 10:10:55+00:00 178 3392 2654 0 0.008281 0.840182 3392 0
commons-pool 2804 2001-04-14 16:40:29+00:00 2026-02-06 14:39:30+00:00 240 3350 609 0 0.016762 0.529601 3350 0
httpcomponents-client 3170 2009-03-01 16:36:52+00:00 2026-02-07 18:00:09+00:00 264 3116 2382 77 0.004416 0.782334 3116 0
commons-beanutils 2196 2001-03-27 05:25:57+00:00 2026-02-06 14:21:23+00:00 133 2285 758 0 0.013661 0.518215 2285 0
commons-lang 8614 2002-07-19 03:35:56+00:00 2026-02-07 20:01:43+00:00 396 2274 4284 0 0.024379 0.717088 2274 0
guava 6972 2011-04-15 17:22:23+00:00 2026-02-07 00:34:41+00:00 378 2037 5074 39 0.000143 0.897734 2037 0
fastjson2-core 4597 2022-04-17 05:16:05+00:00 2026-02-08 00:22:56+00:00 367 2000 4910 452 0.048727 0.774636 2000 0
fastjson-core 2934 2011-07-31 12:05:24+00:00 2023-05-12 06:16:03+00:00 322 1829 2124 0 0.140082 0.914110 1829 0
log4j-api 9835 2013-08-26 12:21:54+00:00 2026-01-22 10:08:42+00:00 145 1546 2200 90 0.032028 0.614540 1546 0
commons-compress 5436 2003-11-23 20:07:47+00:00 2026-02-06 14:23:42+00:00 213 1370 3706 27 0.017476 0.733996 1370 0
commons-io 5560 2002-01-26 02:47:42+00:00 2026-02-07 21:09:27+00:00 187 1209 2505 0 0.016187 0.680576 1209 0
In [4]:
# Side-by-side rankings: commit volume (left) and BC event volume (right).
fig, axes = plt.subplots(1, 2, figsize=(18, max(7, 0.45 * len(coverage))))

panel_specs = [
    ("commits", "#2a9d8f", "Libraries by Commit Count", "Commits analyzed"),
    ("bc_rows", "#e76f51", "Libraries by Breaking-Change Events", "Breaking-change records"),
]
for ax, (col, color, title, xlabel) in zip(axes, panel_specs):
    # bc_rows comes from the row-level BC table for a direct event comparison.
    ordered = coverage.sort_values(col, ascending=False).reset_index()
    sns.barplot(data=ordered, y="library", x=col, ax=ax, color=color)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("")

plt.tight_layout()
plt.show()
No description has been provided for this image

Per-Library Timeline: API Size and Breaking Changes¶

  • API size evolution (all_api_symbols_count and exported_symbols_count)
  • breaking-change introductions as vertical red bars at commit timestamps
In [5]:
# Per-library timeline: API size curves plus vertical red bars marking
# commits that introduced breaking changes.
libs_order = coverage.sort_values("commits", ascending=False).index.tolist()

for lib in libs_order:
    d = commits_df[commits_df["library"] == lib].sort_values("date_utc").copy()
    if d.empty:
        continue

    fig, ax = plt.subplots(figsize=(14, 4.5))

    # API size evolution (total surface vs exported surface)
    ax.plot(d["date_utc"], d["all_api_symbols_count"], color="#1d3557", linewidth=1.6, label="All API symbols")
    ax.plot(d["date_utc"], d["exported_symbols_count"], color="#2a9d8f", linewidth=1.6, label="Exported symbols")

    # Vertical red bars for commits with breaking changes
    bc_commits = d[d["breaking_changes_count"] > 0]
    if not bc_commits.empty:
        ymax = max(float(d["all_api_symbols_count"].max()), 1.0)
        # Bar height scales with break count to preserve the intensity signal.
        bar_top = (bc_commits["breaking_changes_count"] / bc_commits["breaking_changes_count"].max()) * ymax
        ax.vlines(
            bc_commits["date_utc"],
            ymin=0,
            ymax=bar_top,
            color="red",
            alpha=0.45,
            linewidth=1.0,
            # bc_commits is guaranteed non-empty inside this branch, so the
            # label is unconditional (the former `if len(bc_commits) else None`
            # guard could never take its else arm).
            label="Breaking change commit",
        )

    ax.set_title(f"{lib}: API evolution and breaking-change introductions")
    ax.set_xlabel("Date")
    ax.set_ylabel("Symbol count")
    ax.grid(alpha=0.25)

    # Deduplicate legend labels (the two line plots plus the vlines entry).
    handles, labels = ax.get_legend_handles_labels()
    uniq = dict(zip(labels, handles))
    ax.legend(uniq.values(), uniq.keys(), loc="upper left")

    plt.tight_layout()
    plt.show()
    plt.close(fig)  # one figure per library: close to bound memory use
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

API Evolution and Break Risk¶

In [6]:
# One panel per library (all libraries), ordered by final exported API size.
focus_libs = (
    coverage.sort_values("final_exported_symbols", ascending=False)
    .index
    .tolist()
)

# Size the grid from the data instead of hardcoding an 18x2 layout: a fixed
# grid raises IndexError as soon as the dataset grows past 36 libraries.
n_cols = 2
n_rows = max(1, math.ceil(len(focus_libs) / n_cols))
# ~5.6in per row reproduces the original 100in height at 18 rows.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, max(7, 5.6 * n_rows)), sharex=False, squeeze=False)
axes = axes.flatten()

for i, lib in enumerate(focus_libs):
    d = commits_df[commits_df["library"] == lib].copy().sort_values("date_utc")
    if d.empty:
        axes[i].axis("off")
        continue

    # Smooth the noisy per-commit risk signal with a 60-commit rolling mean.
    d["rolling_break_risk"] = d["breaks_per_1k_exported"].rolling(window=60, min_periods=10).mean()

    ax = axes[i]
    ax.plot(d["date_utc"], d["exported_symbols_count"], label="Exported symbols", color="#264653", linewidth=2)
    ax.plot(d["date_utc"], d["internal_count"], label="Internal symbols", color="#8ab17d", linewidth=1.6)
    ax.set_title(lib)
    ax.set_ylabel("Symbol count")
    ax.grid(alpha=0.25)

    # Secondary axis so the normalized risk gets its own scale.
    ax2 = ax.twinx()
    ax2.plot(d["date_utc"], d["rolling_break_risk"], label="Rolling breaks/1k exported", color="#e76f51", linewidth=1.6)
    ax2.set_ylabel("Breaks per 1k")

    # Merge the legends of both y-axes into a single box.
    lines, labels = ax.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax.legend(lines + lines2, labels + labels2, loc="upper left", fontsize=9)

# Hide any unused panels at the end of the grid.
for j in range(len(focus_libs), len(axes)):
    axes[j].axis("off")

plt.suptitle("API size evolution and normalized breaking-change risk", y=1.02)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [7]:
# Down-sample for plotting cost; fixed seed keeps the figure reproducible.
sample_cols = ["library", "abs_loc_churn", "files_changed", "breaking_changes_count", "breaks_per_1k_exported"]
sample = commits_df[sample_cols].copy()
if len(sample) > 100_000:
    sample = sample.sample(100_000, random_state=42)

fig, axes = plt.subplots(1, 2, figsize=(18, 7))

# Shared scatter styling for both panels.
scatter_kwargs = dict(data=sample, hue="library", alpha=0.25, linewidth=0, s=18, legend=False)

sns.scatterplot(x="abs_loc_churn", y="breaking_changes_count", ax=axes[0], **scatter_kwargs)
# symlog keeps the (very common) zero values on-axis while spreading the tail.
axes[0].set_xscale("symlog", linthresh=1)
axes[0].set_yscale("symlog", linthresh=1)
axes[0].set_title("Code churn vs breaking changes per commit")
axes[0].set_xlabel("LOC churn (added + deleted)")
axes[0].set_ylabel("Breaking changes")

sns.scatterplot(x="files_changed", y="breaks_per_1k_exported", ax=axes[1], **scatter_kwargs)
axes[1].set_xscale("symlog", linthresh=1)
axes[1].set_yscale("symlog", linthresh=0.1)
axes[1].set_title("Files changed vs normalized break risk")
axes[1].set_xlabel("Files changed")
axes[1].set_ylabel("Breaks per 1k exported symbols")

plt.tight_layout()
plt.show()
No description has been provided for this image

Breaking-Change Taxonomy¶

In [8]:
# Event counts per (library, kind), restricted to the 15 most frequent kinds.
kind_counts = bcs_df.groupby(["library", "kind"]).size().rename("count").reset_index()
top_kinds = bcs_df["kind"].value_counts().head(15).index

kind_matrix = (
    kind_counts.loc[kind_counts["kind"].isin(top_kinds)]
    .pivot(index="library", columns="kind", values="count")
    .fillna(0)
)
# Order rows by total event volume so the busiest libraries sit on top.
row_order = kind_matrix.sum(axis=1).sort_values(ascending=False).index
kind_matrix = kind_matrix.loc[row_order]

plt.figure(figsize=(18, 20))
# log1p keeps rare kinds visible next to six-figure outliers.
sns.heatmap(np.log1p(kind_matrix), cmap="YlOrRd", cbar_kws={"label": "log(1 + BC events)"})
plt.title("Breaking-change kind intensity by library (top 15 kinds)")
plt.xlabel("Breaking-change kind")
plt.ylabel("Library")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [9]:
# Distribution of compatibility verdicts per library, as within-library shares.
compat = bcs_df.groupby(["library", "compatibility"]).size().rename("count").reset_index()
compat["share"] = compat["count"] / compat.groupby("library")["count"].transform("sum")

top_libs_for_compat = coverage.sort_values("bc_rows", ascending=False).head(12).index
plot_compat = compat.loc[compat["library"].isin(top_libs_for_compat)]

plt.figure(figsize=(18, 8))
sns.barplot(data=plot_compat, x="library", y="share", hue="compatibility")
plt.title("Compatibility profile by library (top 12 by BC count)")
plt.xlabel("Library")
plt.ylabel("Share of breaking changes")
plt.xticks(rotation=70, ha="right")
plt.tight_layout()
plt.show()
No description has been provided for this image

Excluded, Internal, and Deprecated-Removal Signals¶

In [10]:
# Aggregate flag counts per library, then express each as a share of all BCs.
# NOTE: `flags` (name and columns) is reused by the consolidated summary cell.
flags = (
    bcs_df.groupby("library")
    .agg(
        total_bcs=("kind", "count"),
        excluded_bcs=("is_excluded_symbol", "sum"),
        internal_removals=("is_internal_removal", "sum"),
        deprecated_removals=("is_deprecated_removal", "sum"),
    )
    .reset_index()
)

# Map a zero denominator to NaN first, then back to 0, to avoid division noise.
safe_total = flags["total_bcs"].replace(0, np.nan)
for col in ["excluded_bcs", "internal_removals", "deprecated_removals"]:
    flags[f"{col}_share"] = (flags[col] / safe_total).fillna(0)

share_labels = {
    "excluded_bcs_share": "Excluded symbol BC share",
    "internal_removals_share": "Internal removal share",
    "deprecated_removals_share": "Deprecated-removal share",
}
long_flag = flags.sort_values("total_bcs", ascending=False).melt(
    id_vars=["library", "total_bcs"],
    value_vars=list(share_labels),
    var_name="metric",
    value_name="share",
)
long_flag["metric"] = long_flag["metric"].map(share_labels)

plt.figure(figsize=(20, 8))
sns.barplot(data=long_flag, x="library", y="share", hue="metric")
plt.title("Excluded/internal/deprecated-removal ratios by library")
plt.xlabel("Library")
plt.ylabel("Share of breaking-change events")
plt.xticks(rotation=70, ha="right")
plt.tight_layout()
plt.show()

flags.sort_values("excluded_bcs_share", ascending=False).head(10)
No description has been provided for this image
Out[10]:
library total_bcs excluded_bcs internal_removals deprecated_removals excluded_bcs_share internal_removals_share deprecated_removals_share
32 rxjava-core 128110 100935 2773 1879 0.787878 0.021645 0.014667
15 gson 470 353 121 0 0.751064 0.257447 0.000000
16 guava 2037 1488 189 278 0.730486 0.092784 0.136475
19 hibernate-core 34244 5369 2727 730 0.156787 0.079634 0.021318
14 fastjson2-core 2000 151 137 3 0.075500 0.068500 0.001500
1 assertj-core 230771 9699 6993 1397 0.042029 0.030303 0.006054
28 log4j-api 1546 64 12 1 0.041397 0.007762 0.000647
20 httpcomponents-client 3116 124 22 214 0.039795 0.007060 0.068678
18 hamcrest-core 435 8 8 1 0.018391 0.018391 0.002299
27 jsoup 919 14 8 51 0.015234 0.008705 0.055495
In [11]:
# Which visibility levels do breaking changes hit, per library?
vis = bcs_df.groupby(["library", "symbol_visibility"]).size().rename("count").reset_index()
vis["share"] = vis["count"] / vis.groupby("library")["count"].transform("sum")

top_libs = coverage.sort_values("bc_rows", ascending=False).head(10).index
vis_plot = vis.loc[vis["library"].isin(top_libs)]

plt.figure(figsize=(18, 8))
sns.barplot(data=vis_plot, x="library", y="share", hue="symbol_visibility")
plt.title("Visibility of impacted symbols in breaking changes")
plt.xlabel("Library")
plt.ylabel("Share")
plt.xticks(rotation=65, ha="right")
plt.tight_layout()
plt.show()
No description has been provided for this image

Temporal Patterns and Concentration¶

In [12]:
# Aggregate commit activity and break volume per (library, calendar month).
commits_by_month = commits_df.copy()
commits_by_month["month"] = (
    commits_by_month["date_utc"].dt.tz_convert(None).dt.to_period("M").astype(str)
)

monthly = (
    commits_by_month.groupby(["library", "month"])
    .agg(
        monthly_commits=("commit_sha", "count"),
        monthly_breaks=("breaking_changes_count", "sum"),
        monthly_abs_churn=("abs_loc_churn", "sum"),
    )
    .reset_index()
)
monthly["breaks_per_100_commits"] = (
    monthly["monthly_breaks"] / monthly["monthly_commits"].replace(0, np.nan) * 100
).fillna(0)

# Heatmap of monthly break intensity for the most active libraries.
active_libs = coverage.sort_values("commits", ascending=False).index
heat = (
    monthly.loc[monthly["library"].isin(active_libs)]
    .pivot(index="library", columns="month", values="breaks_per_100_commits")
    .fillna(0)
)

plt.figure(figsize=(22, 18))
sns.heatmap(np.log1p(heat), cmap="rocket_r", cbar_kws={"label": "log(1 + breaks per 100 commits)"})
plt.title("Temporal break intensity (most active libraries)")
plt.xlabel("Month")
plt.ylabel("Library")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [13]:
# Pareto concentration: how many commits explain 80% of each library's breaks.
def _commits_for_80pct_breaks(break_counts: pd.Series):
    """Smallest number of commits whose breaks cover >= 80% of the total.

    Returns NaN when the library reported no breaking changes at all.
    """
    ordered = break_counts.sort_values(ascending=False).reset_index(drop=True)
    total = ordered.sum()
    if total <= 0:
        return np.nan
    cumulative = ordered.cumsum()
    # Count commits strictly below the threshold, then add the one that crosses it.
    return int((cumulative < (0.8 * total)).sum() + 1)


pareto_rows = [
    {
        "library": lib,
        "commits_for_80pct_breaks": _commits_for_80pct_breaks(d["breaking_changes_count"]),
        "total_commits": len(d),
    }
    for lib, d in commits_df.groupby("library")
]

# NOTE: `pareto` is reused by the consolidated summary cell.
pareto = pd.DataFrame(pareto_rows)
pareto["share_of_commits_for_80pct_breaks"] = (
    pareto["commits_for_80pct_breaks"] / pareto["total_commits"].replace(0, np.nan)
)
pareto = pareto.sort_values("share_of_commits_for_80pct_breaks")

plt.figure(figsize=(16, 7))
sns.barplot(data=pareto.dropna(subset=["share_of_commits_for_80pct_breaks"]), x="library", y="share_of_commits_for_80pct_breaks", color="#457b9d")
plt.title("Concentration of breaking changes (lower means more concentrated)")
plt.xlabel("Library")
plt.ylabel("Commit share needed to accumulate 80% of breaks")
plt.xticks(rotation=70, ha="right")
plt.tight_layout()
plt.show()

pareto.head(15)
No description has been provided for this image
Out[13]:
library commits_for_80pct_breaks total_commits share_of_commits_for_80pct_breaks
0 JSON-java 1 389 0.002571
28 log4j-api 26 9835 0.002644
24 jakartaee-servlet-api 2 553 0.003617
29 netty-codec-http 43 9661 0.004451
12 commons-text 14 2322 0.006029
30 protobuf-java 10 1151 0.008688
4 commons-cli 16 1838 0.008705
8 commons-io 50 5560 0.008993
7 commons-compress 59 5436 0.010854
10 commons-logging 21 1826 0.011501
3 commons-beanutils 26 2196 0.011840
5 commons-codec 41 3172 0.012926
23 jakartaee-jaxrs-api 5 380 0.013158
6 commons-collections 65 4856 0.013386
33 slf4j-api 22 1638 0.013431

Hotspots in Impacted Packages and Types¶

In [14]:
def _hotspot_counts(frame: pd.DataFrame, column: str) -> pd.DataFrame:
    """BC event counts per (library, <column>) pair, heaviest hitters first."""
    return (
        frame.dropna(subset=[column])
        .groupby(["library", column]).size().rename("count").reset_index()
        .sort_values("count", ascending=False)
    )


pkg_hotspots = _hotspot_counts(bcs_df, "impacted_package_fqn")
type_hotspots = _hotspot_counts(bcs_df, "impacted_type_fqn")

print("Top impacted packages across all libraries:")
display(pkg_hotspots.head(20))

print("Top impacted types across all libraries:")
display(type_hotspots.head(20))
Top impacted packages across all libraries:
library impacted_package_fqn count
77 assertj-core org.assertj.core.api 180600
1215 jackson-databind com.fasterxml.jackson.databind.deser.std 57955
1371 rxjava-core io.reactivex.internal.operators.observable 39001
1367 rxjava-core io.reactivex.internal.operators.flowable 27820
1402 rxjava-core io.reactivex.rxjava3.internal.operators.flowable 12002
1210 jackson-databind com.fasterxml.jackson.databind.deser 11785
1247 jackson-databind tools.jackson.databind.deser.jdk 10886
1212 jackson-databind com.fasterxml.jackson.databind.deser.impl 8228
1091 hibernate-core org.hibernate.type 7595
1217 jackson-databind com.fasterxml.jackson.databind.ext 7092
1447 rxjava-core rx.subjects 6323
278 assertj-core org.fest.assertions.api 6216
1235 jackson-databind com.fasterxml.jackson.databind.ser.std 5951
1420 rxjava-core io.reactivex.subjects 5095
715 h2-database org.h2.value 4518
1373 rxjava-core io.reactivex.internal.operators.single 4395
220 assertj-core org.assertj.core.condition 4086
1366 rxjava-core io.reactivex.internal.operators.completable 3793
128 assertj-core org.assertj.core.api.objectarray 3595
1219 jackson-databind com.fasterxml.jackson.databind.ext.jdk8 3207
Top impacted types across all libraries:
library impacted_type_fqn count
1764 assertj-core org.assertj.core.api.BDDAssertions 6043
1646 assertj-core org.assertj.core.api.Assertions 4219
20579 jackson-databind com.fasterxml.jackson.databind.deser.BeanDeser... 3500
20852 jackson-databind com.fasterxml.jackson.databind.deser.std.Throw... 3421
2029 assertj-core org.assertj.core.api.WithAssertions 2934
20585 jackson-databind com.fasterxml.jackson.databind.deser.BuilderBa... 2911
20580 jackson-databind com.fasterxml.jackson.databind.deser.BeanDeser... 2726
1597 assertj-core org.assertj.core.api.AbstractIterableAssert 2710
1600 assertj-core org.assertj.core.api.AbstractListAssert 2648
20649 jackson-databind com.fasterxml.jackson.databind.deser.impl.Bean... 2601
1952 assertj-core org.assertj.core.api.ObjectArrayAssert 2530
20648 jackson-databind com.fasterxml.jackson.databind.deser.impl.Bean... 2492
1824 assertj-core org.assertj.core.api.ConcreteIterableAssert 2472
1892 assertj-core org.assertj.core.api.IterableAssert 2395
1925 assertj-core org.assertj.core.api.ListAssert 2323
1610 assertj-core org.assertj.core.api.AbstractObjectArrayAssert 2302
1855 assertj-core org.assertj.core.api.FactoryBasedNavigableIter... 1939
1811 assertj-core org.assertj.core.api.ClassBasedNavigableIterab... 1938
1856 assertj-core org.assertj.core.api.FactoryBasedNavigableList... 1906
1911 assertj-core org.assertj.core.api.Java6Assertions 1889

Consolidated Summary Table¶

In [15]:
# Consolidate the per-library views (coverage, flag shares, Pareto share)
# into one wide summary table.
summary = (
    coverage.reset_index()
    .merge(
        flags[["library", "excluded_bcs_share", "internal_removals_share", "deprecated_removals_share"]],
        on="library",
        how="left",
    )
    .merge(
        pareto[["library", "share_of_commits_for_80pct_breaks"]],
        on="library",
        how="left",
    )
)

# Break intensity relative to the library's final exported API size.
summary["breaks_per_1k_final_exported"] = (
    summary["total_reported_breaks"] / summary["final_exported_symbols"].replace(0, np.nan) * 1000
).fillna(0)

summary = summary.sort_values(["total_reported_breaks", "commits"], ascending=[False, False])
summary.head(25)
Out[15]:
library commits first_commit last_commit commits_with_breaks total_reported_breaks final_exported_symbols final_internal_symbols merge_commit_rate java_change_rate bc_rows bc_count_delta excluded_bcs_share internal_removals_share deprecated_removals_share share_of_commits_for_80pct_breaks breaks_per_1k_final_exported
0 assertj-core 3672 2010-09-07 04:06:36+00:00 2026-02-07 22:12:04+00:00 1001 230771 6932 85 0.049564 0.512527 230771 0 0.042029 0.030303 0.006054 0.043845 33290.680900
1 jackson-databind 6582 2011-12-23 08:31:35+00:00 2026-02-08 04:52:24+00:00 2007 145244 9122 0 0.421908 0.822546 145244 0 0.000103 0.000103 0.020669 0.024157 15922.385442
2 rxjava-core 3499 2013-01-09 06:21:43+00:00 2026-02-06 08:29:11+00:00 541 128110 4254 1850 0.335810 0.661332 128110 0 0.787878 0.021645 0.014667 0.014290 30115.185708
3 hibernate-core 4826 2010-10-11 19:41:47+00:00 2016-08-10 05:41:04+00:00 846 34244 25810 9464 0.016162 0.846042 34244 0 0.156787 0.079634 0.021318 0.018027 1326.772569
4 h2-database 8024 2006-12-15 00:12:44+00:00 2026-01-31 05:09:18+00:00 1384 34007 10719 0 0.192547 0.871386 34007 0 0.000000 0.000000 0.000706 0.018569 3172.590727
5 tomcat 27700 2006-03-27 13:53:46+00:00 2026-02-06 22:39:49+00:00 3287 33514 23836 0 0.002310 0.750433 33514 0 0.000000 0.000000 0.054336 0.023899 1406.024501
6 commons-collections 4856 2001-04-14 15:38:58+00:00 2026-02-06 14:23:17+00:00 406 9963 3713 0 0.022035 0.678336 9963 0 0.000000 0.000000 0.001807 0.013386 2683.274980
7 jackson-core 2572 2011-12-23 07:00:40+00:00 2026-02-07 21:49:29+00:00 424 7384 3002 0 0.422628 0.683904 7384 0 0.000000 0.000000 0.044556 0.027994 2459.693538
8 joda-time 2132 2003-12-16 21:39:27+00:00 2026-01-21 20:45:38+00:00 178 3676 3045 0 0.043152 0.568011 3676 0 0.000000 0.000000 0.000000 0.018762 1207.224959
9 netty-codec-http 9661 2011-12-28 10:44:04+00:00 2026-02-06 10:10:55+00:00 178 3392 2654 0 0.008281 0.840182 3392 0 0.000000 0.000000 0.018868 0.004451 1278.070836
10 commons-pool 2804 2001-04-14 16:40:29+00:00 2026-02-06 14:39:30+00:00 240 3350 609 0 0.016762 0.529601 3350 0 0.000000 0.000000 0.045373 0.025321 5500.821018
11 httpcomponents-client 3170 2009-03-01 16:36:52+00:00 2026-02-07 18:00:09+00:00 264 3116 2382 77 0.004416 0.782334 3116 0 0.039795 0.007060 0.068678 0.014826 1308.144416
12 commons-beanutils 2196 2001-03-27 05:25:57+00:00 2026-02-06 14:21:23+00:00 133 2285 758 0 0.013661 0.518215 2285 0 0.000000 0.000000 0.041575 0.011840 3014.511873
13 commons-lang 8614 2002-07-19 03:35:56+00:00 2026-02-07 20:01:43+00:00 396 2274 4284 0 0.024379 0.717088 2274 0 0.000000 0.000000 0.000440 0.014163 530.812325
14 guava 6972 2011-04-15 17:22:23+00:00 2026-02-07 00:34:41+00:00 378 2037 5074 39 0.000143 0.897734 2037 0 0.730486 0.092784 0.136475 0.016495 401.458415
15 fastjson2-core 4597 2022-04-17 05:16:05+00:00 2026-02-08 00:22:56+00:00 367 2000 4910 452 0.048727 0.774636 2000 0 0.075500 0.068500 0.001500 0.026539 407.331976
16 fastjson-core 2934 2011-07-31 12:05:24+00:00 2023-05-12 06:16:03+00:00 322 1829 2124 0 0.140082 0.914110 1829 0 0.000000 0.000000 0.004921 0.035446 861.111111
17 log4j-api 9835 2013-08-26 12:21:54+00:00 2026-01-22 10:08:42+00:00 145 1546 2200 90 0.032028 0.614540 1546 0 0.041397 0.007762 0.000647 0.002644 702.727273
18 commons-compress 5436 2003-11-23 20:07:47+00:00 2026-02-06 14:23:42+00:00 213 1370 3706 27 0.017476 0.733996 1370 0 0.000730 0.000730 0.000000 0.010854 369.670804
19 commons-io 5560 2002-01-26 02:47:42+00:00 2026-02-07 21:09:27+00:00 187 1209 2505 0 0.016187 0.680576 1209 0 0.000000 0.000000 0.001654 0.008993 482.634731
20 commons-cli 1838 2002-06-10 18:01:16+00:00 2026-02-06 14:22:14+00:00 87 1062 512 0 0.026115 0.431447 1062 0 0.000000 0.000000 0.000000 0.008705 2074.218750
21 commons-codec 3172 2003-04-25 17:51:00+00:00 2026-02-07 14:04:49+00:00 146 932 824 0 0.018600 0.559269 932 0 0.000000 0.000000 0.008584 0.012926 1131.067961
22 jsoup 2163 2011-07-02 11:11:39+00:00 2026-02-03 05:07:10+00:00 142 919 1285 80 0.041147 0.714286 919 0 0.015234 0.008705 0.055495 0.023578 715.175097
23 protobuf-java 1151 2008-07-10 02:12:20+00:00 2016-10-20 00:33:59+00:00 28 562 1706 0 0.550825 0.099044 562 0 0.000000 0.000000 0.007117 0.008688 329.425557
24 gson 1977 2008-09-01 03:13:32+00:00 2026-02-01 17:06:30+00:00 108 470 340 0 0.070309 0.669196 470 0 0.751064 0.257447 0.000000 0.017198 1382.352941

Commit Context of Breaking Changes¶

This section links *-bcs.csv rows back to commit metadata to understand the context in which breaking changes are introduced.

In [16]:
# Link BC rows to commit-level context
# Commit metadata carried onto every BC row. "commit_sha" is renamed to
# "commit" to match the join key used by the *-bcs.csv tables.
commit_context_cols = [
    "library", "commit_sha", "date_utc", "is_merge_commit", "has_java_changes", "has_pom_changes",
    "files_changed", "abs_loc_churn", "days_since_prev_commit", "breaking_changes_count",
    "binary_breaking_changes_count", "source_breaking_changes_count", "tag", "version", "branch",
    "exported_symbols_count", "internal_share", "deprecated_share",
]

commit_ctx = commits_df[commit_context_cols].copy()
commit_ctx = commit_ctx.rename(columns={"commit_sha": "commit"})

# validate="many_to_one": many BC rows may share a commit, but each
# (library, commit) pair must be unique on the commit side.
bcs_enriched = bcs_df.merge(commit_ctx, on=["library", "commit"], how="left", validate="many_to_one")

# Commit-level BC event counts from row-level table
bc_rows_per_commit = (
    bcs_enriched.groupby(["library", "commit"]).size().rename("bc_rows").reset_index()
)

# Commits with no BC rows get bc_rows = 0 after the left join.
commit_enriched = commit_ctx.merge(bc_rows_per_commit, on=["library", "commit"], how="left")
commit_enriched["bc_rows"] = commit_enriched["bc_rows"].fillna(0)
commit_enriched["has_bc_rows"] = commit_enriched["bc_rows"] > 0

# Useful derived features
# A commit is treated as a release marker when it carries a tag or a version.
commit_enriched["is_tagged_commit"] = commit_enriched["tag"].notna() | commit_enriched["version"].notna()
# Share of this commit's breaks that are binary-incompatible (0 when no breaks).
commit_enriched["binary_share_in_commit"] = (
    commit_enriched["binary_breaking_changes_count"] /
    commit_enriched["breaking_changes_count"].replace(0, np.nan)
).fillna(0)

print("Enriched BC rows:", len(bcs_enriched))
print("Commits with at least one BC row:", int(commit_enriched["has_bc_rows"].sum()))
print("Share of commits with BC rows:", round(commit_enriched["has_bc_rows"].mean(), 4))
Enriched BC rows: 658235
Commits with at least one BC row: 13868
Share of commits with BC rows: 0.0948
In [17]:
# Contrast the typical context of breaking vs non-breaking commits.
context_summary = (
    commit_enriched.groupby("has_bc_rows")
    .agg(
        commits=("commit", "count"),
        median_files_changed=("files_changed", "median"),
        median_abs_churn=("abs_loc_churn", "median"),
        median_days_since_prev=("days_since_prev_commit", "median"),
        merge_rate=("is_merge_commit", "mean"),
        java_change_rate=("has_java_changes", "mean"),
        pom_change_rate=("has_pom_changes", "mean"),
        tagged_commit_rate=("is_tagged_commit", "mean"),
    )
    .reset_index()
)
# Human-readable group labels for the rendered table.
context_summary["has_bc_rows"] = context_summary["has_bc_rows"].map({False: "No BC", True: "Has BC"})
context_summary
Out[17]:
has_bc_rows commits median_files_changed median_abs_churn median_days_since_prev merge_rate java_change_rate pom_change_rate tagged_commit_rate
0 No BC 132466 1.0 14.0 0.0 0.071226 0.677593 0.115803 0.009361
1 Has BC 13868 5.0 133.0 0.0 0.135203 1.000000 0.024445 0.001947
In [18]:
# Break probability as a function of churn volume and commit inactivity gaps.
work = commit_enriched.copy()
# rank(method="first") breaks ties so qcut can always form ten equal buckets.
work["churn_decile"] = pd.qcut(
    work["abs_loc_churn"].rank(method="first"),
    q=10,
    labels=[f"D{i}" for i in range(1, 11)]
)
work["inactivity_bucket"] = pd.cut(
    work["days_since_prev_commit"],
    bins=[-0.1, 0, 1, 3, 7, 30, 365, np.inf],
    labels=["0d", "1d", "2-3d", "4-7d", "8-30d", "31-365d", ">365d"],
)

# observed=False keeps empty categories so both charts show every bucket.
churn_rates = (
    work.groupby("churn_decile", observed=False)["has_bc_rows"]
    .mean()
    .reset_index(name="break_commit_rate")
)
inactivity_rates = (
    work.groupby("inactivity_bucket", observed=False)["has_bc_rows"]
    .mean()
    .reset_index(name="break_commit_rate")
)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
sns.barplot(data=churn_rates, x="churn_decile", y="break_commit_rate", color="#e76f51", ax=axes[0])
axes[0].set_title("Probability a commit has BCs by churn decile")
axes[0].set_xlabel("Churn decile")
axes[0].set_ylabel("P(commit has BC rows)")

sns.barplot(data=inactivity_rates, x="inactivity_bucket", y="break_commit_rate", color="#264653", ax=axes[1])
axes[1].set_title("Probability a commit has BCs by inactivity")
axes[1].set_xlabel("Days since previous commit")
axes[1].set_ylabel("P(commit has BC rows)")

plt.tight_layout()
plt.show()
No description has been provided for this image

Release/Tag Proximity Effects¶

Here, commits with a non-null tag or version are treated as release markers. We analyze if BCs cluster near these points.

In [19]:
def nearest_release_distance_days(lib_df: pd.DataFrame) -> pd.Series:
    """Distance in days from each commit to its closest release marker.

    A release marker is any commit in ``lib_df`` with a non-null ``tag`` or
    ``version``. Returns a float Series aligned on ``lib_df.index``; all-NaN
    when the library has no release markers at all.
    """
    ordered = lib_df.sort_values("date_utc").copy()
    marker_mask = ordered["tag"].notna() | ordered["version"].notna()
    marker_dates = ordered.loc[marker_mask, "date_utc"].drop_duplicates().sort_values()

    if marker_dates.empty:
        return pd.Series(np.nan, index=ordered.index)

    markers_ns = marker_dates.astype("int64").to_numpy()
    commits_ns = ordered["date_utc"].astype("int64").to_numpy()

    # For each commit, candidate markers are the one just before and just after it.
    pos = np.searchsorted(markers_ns, commits_ns)
    before = np.clip(pos - 1, 0, markers_ns.size - 1)
    after = np.clip(pos, 0, markers_ns.size - 1)

    nearest_ns = np.minimum(
        np.abs(commits_ns - markers_ns[before]),
        np.abs(commits_ns - markers_ns[after]),
    )
    days = nearest_ns / 1e9 / 86400  # nanoseconds -> seconds -> days

    return pd.Series(days, index=ordered.index).reindex(lib_df.index)


# Attach the per-library distance to the nearest release marker.
commit_enriched["days_to_nearest_release"] = np.nan
for _lib, lib_rows in commit_enriched.groupby("library"):
    commit_enriched.loc[lib_rows.index, "days_to_nearest_release"] = (
        nearest_release_distance_days(lib_rows).values
    )

release_bins = [-0.1, 0, 1, 3, 7, 30, 90, 365, np.inf]
release_labels = ["release day", "1d", "2-3d", "4-7d", "8-30d", "31-90d", "91-365d", ">365d"]

commit_enriched["release_proximity"] = pd.cut(
    commit_enriched["days_to_nearest_release"],
    bins=release_bins,
    labels=release_labels,
)

release_effect = (
    commit_enriched
    .groupby("release_proximity", observed=False)
    .agg(
        commits=("commit", "count"),
        break_commit_rate=("has_bc_rows", "mean"),
        mean_breaks_per_commit=("bc_rows", "mean"),
    )
    .reset_index()
)

release_effect
Out[19]:
release_proximity commits break_commit_rate mean_breaks_per_commit
0 release day 1277 0.021143 2.000000
1 1d 7766 0.070306 3.789596
2 2-3d 6090 0.105911 6.803777
3 4-7d 8765 0.098688 6.581289
4 8-30d 19751 0.098274 9.893778
5 31-90d 15486 0.104223 9.643936
6 91-365d 19885 0.059844 2.670405
7 >365d 39614 0.094739 2.417277
In [20]:
fig, (ax_rate, ax_volume) = plt.subplots(1, 2, figsize=(17, 6))

# Left panel: how often commits break at each distance from a release marker.
sns.barplot(data=release_effect, x="release_proximity", y="break_commit_rate", color="#2a9d8f", ax=ax_rate)
ax_rate.set_title("Break-commit rate by distance to nearest release marker")
ax_rate.set_xlabel("Distance to nearest release/tag commit")
ax_rate.set_ylabel("P(commit has BC rows)")
ax_rate.tick_params(axis="x", rotation=30)

# Right panel: how many BC rows those commits carry on average.
sns.barplot(data=release_effect, x="release_proximity", y="mean_breaks_per_commit", color="#f4a261", ax=ax_volume)
ax_volume.set_title("Mean BC rows per commit by release proximity")
ax_volume.set_xlabel("Distance to nearest release/tag commit")
ax_volume.set_ylabel("Mean BC rows per commit")
ax_volume.tick_params(axis="x", rotation=30)

plt.tight_layout()
plt.show()
No description has been provided for this image

Commit Intent and Evolution Style¶

We derive lightweight intent buckets from commit messages and conventional tags to compare evolution styles and BC policies.

In [21]:
def infer_intent(row):
    """Bucket a commit into a coarse intent label.

    Matches keyword rules against the lowercased conventional-commit tag plus
    short message. Rules are ordered: the first rule with any matching
    keyword wins, and commits matching nothing fall back to "other".
    """
    tag = str(row.get("conventional_commit_tag", "") or "").strip().lower()
    msg = str(row.get("commit_short_msg", "") or "").strip().lower()
    haystack = f"{tag} {msg}"

    # (label, keywords) pairs — order matters, earlier rules take precedence.
    keyword_rules = [
        ("release", ["release", "version", "bump version", "prepare", "tag"]),
        ("revert", ["revert"]),
        ("deps", ["dependenc", "upgrade", "bump", "renovate"]),
        ("build", ["pom", "gradle", "maven", "build", "ci", "workflow"]),
        ("docs", ["readme", "doc", "javadoc", "documentation"]),
        ("test", ["test", "assert", "spec"]),
        ("fix", ["fix", "bug", "issue", "patch", "hotfix"]),
        ("refactor", ["refactor", "cleanup", "simplify", "rename"]),
        ("feature", ["feat", "feature", "add", "introduce", "implement"]),
        ("breaking", ["breaking", "remove", "drop", "deprecat", "api change"]),
    ]

    return next(
        (
            label
            for label, keywords in keyword_rules
            if any(kw in haystack for kw in keywords)
        ),
        "other",
    )


# Join commit messages/tags onto enriched commits, then classify each commit.
intent_df = commit_enriched.merge(
    commits_df[["library", "commit_sha", "commit_short_msg", "conventional_commit_tag"]],
    left_on=["library", "commit"],
    right_on=["library", "commit_sha"],
    how="left",
)
intent_df["intent"] = intent_df.apply(infer_intent, axis=1)

# Per (library, intent): volume, break propensity, and average BC volume.
intent_stats = (
    intent_df
    .groupby(["library", "intent"])
    .agg(
        commits=("commit", "count"),
        break_commit_rate=("has_bc_rows", "mean"),
        mean_bc_rows=("bc_rows", "mean"),
    )
    .reset_index()
)

# Share of each intent within its library's total commits.
intent_stats["intent_share"] = (
    intent_stats["commits"] / intent_stats.groupby("library")["commits"].transform("sum")
)

intent_stats.head(20)
Out[21]:
library intent commits break_commit_rate mean_bc_rows intent_share
0 JSON-java breaking 7 0.142857 0.142857 0.017995
1 JSON-java build 21 0.047619 7.142857 0.053985
2 JSON-java deps 2 0.000000 0.000000 0.005141
3 JSON-java docs 42 0.000000 0.000000 0.107969
4 JSON-java feature 7 0.142857 0.428571 0.017995
5 JSON-java fix 66 0.030303 0.060606 0.169666
6 JSON-java other 181 0.000000 0.000000 0.465296
7 JSON-java refactor 6 0.166667 0.500000 0.015424
8 JSON-java release 27 0.000000 0.000000 0.069409
9 JSON-java revert 6 0.166667 2.500000 0.015424
10 JSON-java test 24 0.000000 0.000000 0.061697
11 assertj-core breaking 106 0.415094 67.849057 0.028867
12 assertj-core build 169 0.248521 86.201183 0.046024
13 assertj-core deps 753 0.007968 17.540505 0.205065
14 assertj-core docs 231 0.290043 77.510823 0.062908
15 assertj-core feature 122 0.319672 62.647541 0.033224
16 assertj-core fix 266 0.421053 56.755639 0.072440
17 assertj-core other 426 0.415493 163.873239 0.116013
18 assertj-core refactor 67 0.597015 55.313433 0.018246
19 assertj-core release 643 0.024883 11.555210 0.175109
In [22]:
# Evolution style map: intent composition (share) + policy map (break rate by intent)
selected_intents = ["feature", "fix", "refactor", "deps", "build", "release", "breaking", "docs", "test", "other"]
plot_intent = intent_stats[intent_stats["intent"].isin(selected_intents)].copy()

# Keep the same library ordering (by commit volume) in both panels.
lib_order = coverage.sort_values("commits", ascending=False).index

heat_share = (
    plot_intent.pivot(index="library", columns="intent", values="intent_share")
    .fillna(0)
    .reindex(lib_order)
)
heat_break_rate = (
    plot_intent.pivot(index="library", columns="intent", values="break_commit_rate")
    .fillna(0)
    .reindex(lib_order)
)

fig, (ax_share, ax_rate) = plt.subplots(1, 2, figsize=(22, 10))

sns.heatmap(heat_share, cmap="Blues", ax=ax_share, cbar_kws={"label": "Share of commits"})
ax_share.set_title("Intent composition by library")
ax_share.set_xlabel("Intent")
ax_share.set_ylabel("Library")

sns.heatmap(heat_break_rate, cmap="Reds", ax=ax_rate, cbar_kws={"label": "P(commit has BC rows)"})
ax_rate.set_title("Break propensity by intent and library")
ax_rate.set_xlabel("Intent")
ax_rate.set_ylabel("")

plt.tight_layout()
plt.show()
No description has been provided for this image

Library Profiles and Comparative Mapping¶

We aggregate multi-dimensional features to characterize library evolution profiles and compare policy/style differences.

In [23]:
# Build per-library profile features leveraging both commits and BC rows.
# Every aggregate below is indexed by `library` and joined onto `profile`.

# Count of BC rows per (library, nature); natures become columns.
nature_stats = bcs_df.groupby(["library", "nature"]).size().unstack(fill_value=0)
# Number of distinct BC kinds a library has ever produced.
kind_diversity = bcs_df.groupby("library")["kind"].nunique().rename("bc_kind_diversity")
# Share of BC rows whose impacted symbol is declared public.
visibility_public_share = (
    bcs_df.assign(is_public=bcs_df["symbol_visibility"].eq("public"))
    .groupby("library")["is_public"].mean()
    .rename("public_visibility_share")
)

# Volatility of breaking activity: std-dev of monthly BC totals per library
# (tz dropped so the period conversion is uniform; single-month libraries -> 0).
monthly_break_vol = (
    commits_df.assign(month=commits_df["date_utc"].dt.tz_convert(None).dt.to_period("M").astype(str))
    .groupby(["library", "month"])["breaking_changes_count"].sum()
    .groupby("library").std()
    .fillna(0)
    .rename("monthly_break_std")
)

# Commit-level aggregates. The lambda computes the median churn restricted to
# breaking commits only (0 when the library has none); it closes over the
# global commit_enriched to look up has_bc_rows for the group's rows.
profile = commit_enriched.groupby("library").agg(
    commits=("commit", "count"),
    share_break_commits=("has_bc_rows", "mean"),
    mean_bc_rows_per_commit=("bc_rows", "mean"),
    median_churn=("abs_loc_churn", "median"),
    median_churn_break_commits=("abs_loc_churn", lambda s: s[commit_enriched.loc[s.index, "has_bc_rows"]].median() if commit_enriched.loc[s.index, "has_bc_rows"].any() else 0),
    merge_rate=("is_merge_commit", "mean"),
    tagged_commit_rate=("is_tagged_commit", "mean"),
    pom_change_rate=("has_pom_changes", "mean"),
    java_change_rate=("has_java_changes", "mean"),
    median_days_since_prev=("days_since_prev_commit", "median"),
    mean_internal_share=("internal_share", "mean"),
    mean_deprecated_share=("deprecated_share", "mean"),
    mean_binary_share_per_breaking_commit=("binary_share_in_commit", "mean"),
).join(flags.set_index("library")[["excluded_bcs_share", "internal_removals_share", "deprecated_removals_share"]], how="left")
# NOTE(review): `flags` comes from an earlier cell — presumably one row per library; verify.

profile = profile.join(kind_diversity, how="left")
profile = profile.join(visibility_public_share, how="left")
profile = profile.join(monthly_break_vol, how="left")

# Guarantee all three nature columns exist before computing shares.
for col in ["deletion", "mutation", "addition"]:
    if col not in nature_stats.columns:
        nature_stats[col] = 0
# Row-normalise by the total across ALL natures; replace(0, NaN) avoids
# division by zero, and fillna(0) maps no-BC libraries back to zero shares.
nature_shares = nature_stats[["deletion", "mutation", "addition"]].div(nature_stats.sum(axis=1).replace(0, np.nan), axis=0).fillna(0)
nature_shares.columns = ["nature_deletion_share", "nature_mutation_share", "nature_addition_share"]
profile = profile.join(nature_shares, how="left")

# Derived convenience metrics.
profile["bc_per_100_commits"] = profile["mean_bc_rows_per_commit"] * 100
# How much larger the median churn of breaking commits is vs all commits;
# inf/NaN (zero overall median or no breaking commits) collapse to 0.
profile["break_churn_multiplier"] = profile["median_churn_break_commits"] / profile["median_churn"].replace(0, np.nan)
profile["break_churn_multiplier"] = profile["break_churn_multiplier"].replace([np.inf, -np.inf], np.nan).fillna(0)

profile.head(20)
Out[23]:
commits share_break_commits mean_bc_rows_per_commit median_churn median_churn_break_commits merge_rate tagged_commit_rate pom_change_rate java_change_rate median_days_since_prev mean_internal_share mean_deprecated_share mean_binary_share_per_breaking_commit excluded_bcs_share internal_removals_share deprecated_removals_share bc_kind_diversity public_visibility_share monthly_break_std nature_deletion_share nature_mutation_share nature_addition_share bc_per_100_commits break_churn_multiplier
library
JSON-java 389 0.017995 0.452442 24.0 692.0 0.658098 0.064267 0.084833 0.735219 4.0 0.000000 0.000275 0.015544 0.000000 0.000000 0.000000 7 0.982955 13.581015 0.159091 0.840909 0.000000 45.244216 28.833333
assertj-core 3672 0.272603 62.846133 10.0 261.0 0.049564 0.016885 0.395969 0.512527 0.0 0.179138 0.007116 0.255184 0.042029 0.030303 0.006054 24 0.875786 2444.442170 0.592410 0.389741 0.017849 6284.613290 26.100000
awaitility 630 0.063492 0.415873 6.0 199.5 0.057143 0.060317 0.325397 0.298413 0.0 0.000000 0.000334 0.050452 0.000000 0.000000 0.003817 14 1.000000 9.565623 0.648855 0.297710 0.053435 41.587302 33.250000
commons-beanutils 2196 0.060565 1.040528 8.0 75.0 0.013661 0.000455 0.141166 0.518215 0.0 0.000000 0.010436 0.053713 0.000000 0.000000 0.041575 21 0.820569 53.498194 0.466083 0.525602 0.008315 104.052823 9.375000
commons-cli 1838 0.047334 0.577802 7.0 120.0 0.026115 0.001632 0.152884 0.431447 0.0 0.000000 0.049088 0.034920 0.000000 0.000000 0.000000 21 0.953861 36.877658 0.715631 0.245763 0.038606 57.780196 17.142857
commons-codec 3172 0.046028 0.293821 8.0 69.5 0.018600 0.001892 0.139029 0.559269 0.0 0.000000 0.058108 0.038974 0.000000 0.000000 0.008584 16 0.907725 20.824764 0.680258 0.313305 0.006438 29.382093 8.687500
commons-collections 4856 0.083608 2.051689 14.0 108.0 0.022035 0.001030 0.091639 0.678336 0.0 0.000000 0.003226 0.070279 0.000000 0.000000 0.001807 25 0.821841 316.188783 0.430192 0.552645 0.017164 205.168863 7.714286
commons-compress 5436 0.039183 0.252024 10.0 79.0 0.017476 0.001104 0.093635 0.733996 0.0 0.001397 0.014987 0.029115 0.000730 0.000730 0.000000 17 0.805109 21.136561 0.524818 0.452555 0.022628 25.202355 7.900000
commons-io 5560 0.033633 0.217446 9.0 145.0 0.016187 0.001799 0.107734 0.680576 0.0 0.000000 0.031862 0.028655 0.000000 0.000000 0.001654 20 0.892473 24.876858 0.493797 0.434243 0.071960 21.744604 16.111111
commons-lang 8614 0.045972 0.263989 10.0 105.5 0.024379 0.001509 0.086951 0.717088 0.0 0.000000 0.018381 0.040930 0.000000 0.000000 0.000440 22 0.933157 24.821353 0.619613 0.359719 0.020668 26.398886 10.550000
commons-logging 1826 0.033406 0.197700 8.0 92.0 0.036692 0.002738 0.171961 0.365279 0.0 0.000000 0.018995 0.027149 0.000000 0.000000 0.000000 13 0.850416 11.760604 0.764543 0.196676 0.038781 19.769989 11.500000
commons-pool 2804 0.085592 1.194722 8.0 81.5 0.016762 0.007133 0.166904 0.529601 0.0 0.000000 0.020874 0.073684 0.000000 0.000000 0.045373 17 0.862388 80.660140 0.640000 0.322985 0.037015 119.472183 10.187500
commons-text 2322 0.020241 0.123600 5.0 336.0 0.071921 0.004307 0.261413 0.444875 0.0 0.000428 0.012485 0.017980 0.010453 0.010453 0.048780 13 0.954704 10.949225 0.801394 0.135889 0.062718 12.360034 67.200000
fastjson-core 2934 0.109748 0.623381 36.0 165.5 0.140082 0.029993 0.103954 0.914110 0.0 0.000000 0.018740 0.101410 0.000000 0.000000 0.004921 23 0.953527 67.188583 0.827775 0.141607 0.030618 62.338105 4.597222
fastjson2-core 4597 0.079835 0.435066 41.0 288.0 0.048727 0.013922 0.219056 0.774636 0.0 0.046233 0.002812 0.069596 0.075500 0.068500 0.001500 20 0.862000 71.829450 0.620000 0.348500 0.031500 43.506635 7.024390
gson 1977 0.054628 0.237734 20.0 120.0 0.070309 0.013151 0.267577 0.669196 0.0 0.109508 0.016023 0.050835 0.751064 0.257447 0.000000 19 0.985106 14.360114 0.651064 0.338298 0.010638 23.773394 6.000000
guava 6972 0.054217 0.292169 31.0 122.5 0.000143 0.000717 0.055221 0.897734 0.0 0.032814 0.028148 0.040945 0.730486 0.092784 0.136475 20 0.974472 30.668046 0.548846 0.349043 0.102111 29.216867 3.951613
h2-database 8024 0.172483 4.238161 37.0 271.5 0.192547 0.002493 0.008724 0.871386 0.0 0.000000 0.000336 0.152342 0.000000 0.000000 0.000706 25 0.899285 471.502985 0.593995 0.386685 0.019320 423.816052 7.337838
hamcrest-core 621 0.077295 0.700483 20.0 80.5 0.069243 0.025765 0.000000 0.611916 0.0 0.053894 0.005599 0.057801 0.018391 0.018391 0.002299 13 0.937931 14.638984 0.434483 0.514943 0.050575 70.048309 4.025000
hibernate-core 4826 0.175300 7.095731 53.0 328.0 0.016162 0.009946 0.013054 0.846042 0.0 0.240281 0.007952 0.127562 0.156787 0.079634 0.021318 27 0.898026 1351.930108 0.666365 0.198341 0.135294 709.573145 6.188679
In [24]:
# Profile map via SVD on standardized features (no external ML deps)
feature_cols = [
    "share_break_commits", "bc_per_100_commits", "merge_rate", "tagged_commit_rate", "pom_change_rate",
    "mean_internal_share", "mean_deprecated_share", "excluded_bcs_share", "internal_removals_share",
    "deprecated_removals_share", "bc_kind_diversity", "public_visibility_share", "monthly_break_std",
    "nature_deletion_share", "nature_mutation_share", "nature_addition_share", "break_churn_multiplier",
]

X = profile[feature_cols].copy().fillna(0)
# Z-score each feature; constant columns get std 1 so they map to 0 instead of NaN.
Xz = (X - X.mean(axis=0)) / X.std(axis=0).replace(0, 1)

# First two left singular vectors scaled by singular values = PCA-style scores.
U, S, Vt = np.linalg.svd(Xz.values, full_matrices=False)
coords = pd.DataFrame(U[:, :2] * S[:2], index=Xz.index, columns=["profile_x", "profile_y"])

profile_map = profile.join(coords)

fig, ax = plt.subplots(figsize=(14, 10))
sns.scatterplot(
    data=profile_map.reset_index(),
    x="profile_x",
    y="profile_y",
    size="bc_per_100_commits",
    hue="share_break_commits",
    palette="viridis",
    sizes=(50, 500),
    alpha=0.85,
    ax=ax,
)
# Label every point with its library name.
for row in profile_map.reset_index().itertuples(index=False):
    ax.text(row.profile_x, row.profile_y, row.library, fontsize=9, alpha=0.85)

ax.set_title("Library profile map (dimension-reduced style/policy features)")
ax.set_xlabel("Profile axis 1")
ax.set_ylabel("Profile axis 2")
ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [25]:
# Z-score comparison heatmap across all libraries, ordered by commit volume.
profile_z = Xz.loc[coverage.sort_values("commits", ascending=False).index]

# Scale figure height with the number of libraries so rows stay readable.
fig_height = max(10, 0.45 * len(profile_z))
plt.figure(figsize=(20, fig_height))
sns.heatmap(profile_z, cmap="coolwarm", center=0, cbar_kws={"label": "Feature z-score"})
plt.title("Comparative profile heatmap (all libraries)")
plt.xlabel("Profile features")
plt.ylabel("Library")
plt.tight_layout()
plt.show()
No description has been provided for this image

Where Breaking Changes Concentrate¶

Concentration metrics show whether BCs are spread across many packages/types or focused in a few hotspots.

In [26]:
def hhi_from_counts(counts: pd.Series) -> float:
    """Herfindahl-Hirschman index of a count distribution.

    Returns the sum of squared shares (1.0 = fully concentrated);
    NaN when the counts sum to zero or less.
    """
    total = counts.sum()
    if total <= 0:
        return np.nan
    return float(((counts / total) ** 2).sum())

# Concentration of BC rows per library, at package and at type granularity.
pkg_conc = bcs_df.groupby(["library", "impacted_package_fqn"]).size().rename("count").reset_index()
pkg_hhi = pkg_conc.groupby("library")["count"].apply(hhi_from_counts).rename("package_hhi")
pkg_unique = pkg_conc.groupby("library")["impacted_package_fqn"].nunique().rename("unique_impacted_packages")

type_conc = bcs_df.groupby(["library", "impacted_type_fqn"]).size().rename("count").reset_index()
type_hhi = type_conc.groupby("library")["count"].apply(hhi_from_counts).rename("type_hhi")
type_unique = type_conc.groupby("library")["impacted_type_fqn"].nunique().rename("unique_impacted_types")

# join() accepts a list of Series, aligning everything on the coverage index.
conc = (
    coverage[["bc_rows"]]
    .join([pkg_hhi, pkg_unique, type_hhi, type_unique], how="left")
    .reset_index()
)

# Same scatter layout for both granularities, driven by a panel spec.
panel_specs = [
    ("package_hhi", "unique_impacted_packages", "mako",
     "Package concentration vs BC volume", "HHI of impacted package distribution"),
    ("type_hhi", "unique_impacted_types", "crest",
     "Type concentration vs BC volume", "HHI of impacted type distribution"),
]

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
for ax, (y_col, count_col, palette, title, ylabel) in zip(axes, panel_specs):
    sns.scatterplot(
        data=conc,
        x="bc_rows",
        y=y_col,
        size=count_col,
        hue=count_col,
        palette=palette,
        sizes=(30, 350),
        ax=ax,
    )
    ax.set_xscale("log")
    ax.set_title(title)
    ax.set_xlabel("BC rows (log scale)")
    ax.set_ylabel(ylabel)

plt.tight_layout()
plt.show()

conc.sort_values("package_hhi", ascending=False).head(15)
No description has been provided for this image
Out[26]:
library bc_rows package_hhi unique_impacted_packages type_hhi unique_impacted_types
33 JSON-java 176 1.000000 1 0.118221 15
23 protobuf-java 562 1.000000 1 0.055426 54
26 hamcrest-core 435 0.691552 4 0.053687 51
0 assertj-core 230771 0.614291 354 0.004927 10189
24 gson 470 0.446781 8 0.068139 103
27 jakartaee-validation 416 0.316753 11 0.018618 161
31 slf4j-api 249 0.314559 5 0.053370 49
8 joda-time 3676 0.308485 12 0.012665 233
28 commons-logging 361 0.305975 18 0.031177 110
10 commons-pool 3350 0.278548 12 0.024256 228
17 log4j-api 1546 0.273850 9 0.110659 131
9 netty-codec-http 3392 0.266669 7 0.012840 226
22 jsoup 919 0.238523 11 0.039584 146
15 fastjson2-core 2000 0.232882 26 0.023001 259
34 jakartaee-servlet-api 131 0.231047 7 0.018938 97
In [31]:
# Public/protected + excluded/internal interaction matrix by kind
interaction = bcs_df.copy()
interaction["group"] = np.select(
    [
        interaction["is_internal_removal"] & interaction["is_excluded_symbol"],
        interaction["is_internal_removal"],
        interaction["is_excluded_symbol"],
        interaction["is_deprecated_removal"],
    ],
    ["internal+excluded", "internal", "excluded", "deprecated-removal"],
    default="regular",
)

top_kinds = interaction["kind"].value_counts().index
mat = (
    interaction[interaction["kind"].isin(top_kinds)]
    .groupby(["kind", "group"]).size().rename("count").reset_index()
)
mat["share"] = mat.groupby("kind")["count"].transform(lambda s: s / s.sum())
heat = mat.pivot(index="kind", columns="group", values="share").fillna(0)

plt.figure(figsize=(12, 12))
sns.heatmap(heat, cmap="OrRd", annot=True, fmt=".2f", cbar_kws={"label": "Share within kind"})
plt.xticks(rotation=90)
plt.title("How BC kinds split across regular/excluded/internal/deprecated groups")
plt.xlabel("BC group")
plt.ylabel("BC kind")
plt.tight_layout()
plt.show()
No description has been provided for this image

Optional: Per-Library Deep Dive Helper¶

Set LIB_TO_INSPECT to quickly visualize the full timeline and BC composition of a specific library.

In [28]:
LIB_TO_INSPECT = "guava"  # change as needed

# Per-library slices of the commit timeline and the BC rows.
dc = commits_df[commits_df["library"] == LIB_TO_INSPECT].sort_values("date_utc").copy()
db = bcs_enriched[bcs_enriched["library"] == LIB_TO_INSPECT].copy()

if dc.empty:
    print(f"Library {LIB_TO_INSPECT} not found.")
else:
    fig, ((ax_api, ax_roll), (ax_kinds, ax_compat)) = plt.subplots(2, 2, figsize=(18, 11))

    # API surface over time: exported vs internal symbol counts.
    ax_api.plot(dc["date_utc"], dc["exported_symbols_count"], color="#1d3557", label="Exported")
    ax_api.plot(dc["date_utc"], dc["internal_count"], color="#2a9d8f", label="Internal")
    ax_api.set_title(f"{LIB_TO_INSPECT}: API size timeline")
    ax_api.legend()

    # Smoothed BC volume: 60-commit rolling mean (needs at least 5 commits).
    rolling_bcs = dc["breaking_changes_count"].rolling(60, min_periods=5).mean()
    ax_roll.plot(dc["date_utc"], rolling_bcs, color="#e76f51")
    ax_roll.set_title(f"{LIB_TO_INSPECT}: rolling BC count (60 commits)")

    # Ten most frequent BC kinds for this library.
    kind_counts = db["kind"].value_counts().head(10)
    sns.barplot(x=kind_counts.values, y=kind_counts.index, ax=ax_kinds, color="#457b9d")
    ax_kinds.set_title("Top BC kinds")
    ax_kinds.set_xlabel("Count")
    ax_kinds.set_ylabel("")

    # Compatibility split, including missing values.
    comp_counts = db["compatibility"].value_counts(dropna=False)
    ax_compat.pie(comp_counts.values, labels=comp_counts.index, autopct="%.1f%%")
    ax_compat.set_title("Compatibility split")

    plt.tight_layout()
    plt.show()
No description has been provided for this image